The following datasets are used in this demonstration:
Population and household estimates (univariate)
1.1 Gender ratio
1.2 Gender by region
1.3 Population by outward postcode using choropleth map
1.4 Population by postcode parent area using choropleth map
1.5 Male/Female ratio by postcode parent area using choropleth map
House prices and earnings (multivariate)
2.1 Correlation between median house price and median household earning
2.2 Median and lower quartile house prices from 2002 to 2020 (animated)
Text analysis and visualisation
Please run this code cell to install required packages. If this does not work, please install these packages manually in your virtual environment.
# Install packages using jupyter notebook's built-in %pip function
%pip install pandas plotly geojson shapely openpyxl scipy nltk
Requirement already satisfied: pandas in d:\repository\ox-interview\venv\lib\site-packages (1.2.4) Requirement already satisfied: plotly in d:\repository\ox-interview\venv\lib\site-packages (4.14.3) Requirement already satisfied: geojson in d:\repository\ox-interview\venv\lib\site-packages (2.5.0) Requirement already satisfied: shapely in d:\repository\ox-interview\venv\lib\site-packages (1.7.1) Requirement already satisfied: openpyxl in d:\repository\ox-interview\venv\lib\site-packages (3.0.7) Requirement already satisfied: scipy in d:\repository\ox-interview\venv\lib\site-packages (1.6.3) Requirement already satisfied: nltk in d:\repository\ox-interview\venv\lib\site-packages (3.6.2) Requirement already satisfied: pytz>=2017.3 in d:\repository\ox-interview\venv\lib\site-packages (from pandas) (2021.1) Requirement already satisfied: numpy>=1.16.5 in d:\repository\ox-interview\venv\lib\site-packages (from pandas) (1.20.3) Requirement already satisfied: python-dateutil>=2.7.3 in d:\repository\ox-interview\venv\lib\site-packages (from pandas) (2.8.1) Requirement already satisfied: six>=1.5 in d:\repository\ox-interview\venv\lib\site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0) Requirement already satisfied: retrying>=1.3.3 in d:\repository\ox-interview\venv\lib\site-packages (from plotly) (1.3.3) Requirement already satisfied: et-xmlfile in d:\repository\ox-interview\venv\lib\site-packages (from openpyxl) (1.1.0) Requirement already satisfied: tqdm in d:\repository\ox-interview\venv\lib\site-packages (from nltk) (4.60.0) Requirement already satisfied: regex in d:\repository\ox-interview\venv\lib\site-packages (from nltk) (2021.4.4) Requirement already satisfied: click in d:\repository\ox-interview\venv\lib\site-packages (from nltk) (8.0.0) Requirement already satisfied: joblib in d:\repository\ox-interview\venv\lib\site-packages (from nltk) (1.0.1) Requirement already satisfied: colorama in d:\repository\ox-interview\venv\lib\site-packages (from click->nltk) (0.4.4) Note: you may need to restart the kernel to use updated packages.
Import packages to be used and replace default plotting backend.
# Import required packages
import pandas as pd
import numpy as np
import json
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re
import shapely.geometry
from shapely.ops import unary_union
import geojson
import itertools
import wordcloud
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Set default pandas plotting backend to Plotly
pd.options.plotting.backend = "plotly"
# Load data
df1 = pd.read_csv("../data/ons/population_household.csv")
# Split postcode into outward and inward, e.g. LE3 9QP => LE3 and 9QP
df1["postcode_out"] = df1["Postcode"].apply(lambda x: x[0:4].strip())
df1["postcode_in"] = df1["Postcode"].apply(lambda x: x[4:].strip())
# Keep only non-numeric part in outward as the parent region
df1["postcode_region"] = df1["postcode_out"].apply(lambda x: re.split("\d+", x)[0])
# Remove column [Postcode] to save memory
df1 = df1.drop(["Postcode"], axis=1)
# Count total number of males and females
df_gender = pd.DataFrame({"Gender": ["Male", "Female"], "Count": [df1["Males"].sum(), df1["Females"].sum()]})
# Generate pie chart
gender_pie = px.pie(df_gender, values="Count", names="Gender", title="National gender ratio (England and Wales)")
gender_pie.show()
# Remove variables to save memory
del df_gender, gender_pie
# Sum up all numbers within same postcode region, descending order by column [Total]
df_temp = df1.groupby(["postcode_region"]).sum().sort_values(by=['Total'], ascending=False).reset_index()
# Generate plots side by side
fig_temp = make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_xaxes=True,
shared_yaxes=False, vertical_spacing=0.001)
# Add plot 1: bar chart for top 10 population regions
fig_temp.append_trace(go.Bar(
x=df_temp.head(10)["Total"],
y=df_temp.head(10)["postcode_region"],
marker=dict(
color='rgba(50, 171, 96, 0.6)',
line=dict(
color='rgba(50, 171, 96, 1.0)',
width=1),
),
name='Population',
orientation='h',
), 1, 1)
# Add plot 2: scatter chart for gender ratio in top 10 population regions
fig_temp.append_trace(go.Scatter(
x=df_temp.head(10)["Males"]/df_temp.head(10)["Females"], y=df_temp.head(10)["postcode_region"],
mode='lines+markers',
line_color='rgb(128, 0, 128)',
name='M/F ratio',
), 1, 2)
# Update layout changes on plot title, axes, color and legends
fig_temp.update_layout(
title='Male/Female ratio in top 10 population regions',
yaxis=dict(
showgrid=False,
showline=False,
showticklabels=True,
domain=[0, 0.85],
),
yaxis2=dict(
showgrid=False,
showline=True,
showticklabels=False,
linecolor='rgba(102, 102, 102, 0.8)',
linewidth=2,
domain=[0, 0.85],
),
xaxis=dict(
zeroline=False,
showline=False,
showticklabels=True,
showgrid=True,
domain=[0, 0.42],
),
xaxis2=dict(
zeroline=False,
showline=False,
showticklabels=True,
showgrid=True,
domain=[0.47, 1],
side='top',
dtick=25000,
),
legend=dict(x=0.029, y=1.038, font_size=10),
margin=dict(l=100, r=20, t=70, b=70),
paper_bgcolor='rgb(248, 248, 255)',
plot_bgcolor='rgb(248, 248, 255)',
)
# Add annotations to the plot
annotations = []
# Adding labels to charts
for a_mf, a_pop, x_pcr in zip(np.round(df_temp.head(10)["Males"]/df_temp.head(10)["Females"], decimals=2), df_temp.head(10)["Total"], df_temp.head(10)["postcode_region"]):
# Add label to M/F ratio scatter plot
annotations.append(dict(xref='x2', yref='y2',
y=x_pcr, x=a_mf,
text=a_mf,
xshift=50,
showarrow=False))
# Add label to population bar chart
annotations.append(dict(xref='x1', yref='y1',
y=x_pcr, x=a_pop,
text=f"{np.round(a_pop/1000000, 3)}M",
xshift=25,
showarrow=False))
fig_temp.update_layout(annotations=annotations)
fig_temp.show()
# Remove variables to save memory
del df_temp, annotations, fig_temp
Concat individual postcode geojson mapping into single variable geojson_uk.
# Create parent geojson collection object
geojson_uk_granulated = dict(type="FeatureCollection", features=list())
# Load all geojson files
path_geojson = Path("../data/geojson")
for f_geojson in list(path_geojson.glob("*.geojson")):
with open(f_geojson) as f:
geojson_data = json.load(f)
for feature in geojson_data["features"]:
# Add feature id using properties.name, which is the outward postcode
feature["id"] = feature["properties"]["name"]
# Add feature to parent geojson collection
geojson_uk_granulated["features"].append(feature)
Plot population with postcode area. Please note that as the opensource geographical data I used in this demonstration comes from Wikipedia, it does not cover all England regions. This leads to white areas on the map.
# Sum up all numbers within same postcode outward area
df_temp = df1.groupby(["postcode_out"]).sum().reset_index()
# Get max population number among areas
p_max = df_temp["Total"].max()
# Plot figure
fig = px.choropleth_mapbox(df1.groupby(["postcode_out"]).sum().reset_index(), geojson=geojson_uk_granulated,
locations='postcode_out', color='Total',
color_continuous_scale="jet",
range_color=(0, p_max),
mapbox_style="carto-positron",
zoom=4.5, center={"lat": 52.5, "lon": -1.6},
opacity=0.5
)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()
# Remove variables to save memory
del df_temp, p_max, fig